Dummy Variables

November 6, 2024

Dummy Variables: Binary Indep. Variables

library(haven)
library(tidyverse)
library(marginaleffects) 
library(jtools)

# Read the two Stata-format datasets used throughout these slides
states <- haven::read_dta(file = "states.dta")
nes <- haven::read_dta(file = "nes.dta")

Dummy Variables: Binary Indep. Variables

  • Let’s look at the “south” variable in the states data
# Frequency table for south: number of rows and percentage share per category
states |>
  count(south) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 2 × 3
  south            n   pct
  <dbl+lbl>    <int> <dbl>
1 0 [Nonsouth]    34    68
2 1 [South]       16    32

Dummy Variables: Binary Indep. Variables

  • Comparing Turnout by South v. non-South
# Mean of vep16_turnout within each category of south
states |>
  group_by(south) |>
  summarise(avg = mean(vep16_turnout))
# A tibble: 2 × 2
  south          avg
  <dbl+lbl>    <dbl>
1 0 [Nonsouth]  63.0
2 1 [South]     58.7

Dummy Variables: Binary Indep. Variables

# Two-sample difference-of-means test of vep16_turnout across the south groups.
# var.equal = TRUE requests the pooled-variance (classic Student) version.
t.test(
  vep16_turnout ~ south,
  data = states,
  var.equal = TRUE
)

    Two Sample t-test

data:  vep16_turnout by south
t = 2.3273, df = 48, p-value = 0.02421
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 0.5865435 8.0340480
sample estimates:
mean in group 0 mean in group 1 
       63.03529        58.72500 

Dummy Variables: Binary Indep. Variables

# Simple (bivariate) regression of vep16_turnout on the south dummy.
# summary.lm() silently ignores a `digits` argument; it only takes effect in
# the print method, so pass it to print() explicitly.
lm(vep16_turnout ~ south, data = states) |>
  summary() |>
  print(digits = 4)

Call:
lm(formula = vep16_turnout ~ south, data = states)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.0353  -4.3103   0.5699   4.6147  11.7647 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   63.035      1.048  60.167   <2e-16 ***
south         -4.310      1.852  -2.327   0.0242 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.109 on 48 degrees of freedom
Multiple R-squared:  0.1014,    Adjusted R-squared:  0.08268 
F-statistic: 5.417 on 1 and 48 DF,  p-value: 0.02421

Dummy Variables: Binary Indep. Variables

# Multiple regression: south dummy plus ba_or_more_2015 and hispanicpct_2016.
# summary.lm() silently ignores a `digits` argument; it only takes effect in
# the print method, so pass it to print() explicitly.
lm(
  vep16_turnout ~ south + ba_or_more_2015 + hispanicpct_2016,
  data = states
) |>
  summary() |>
  print(digits = 4)

Call:
lm(formula = vep16_turnout ~ south + ba_or_more_2015 + hispanicpct_2016, 
    data = states)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.6360  -2.1145  -0.0495   2.3282   8.9824 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      44.65209    4.89562   9.121 6.98e-12 ***
south            -2.15840    1.66087  -1.300  0.20023    
ba_or_more_2015   0.66923    0.15497   4.319 8.29e-05 ***
hispanicpct_2016 -0.19519    0.07114  -2.744  0.00863 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.053 on 46 degrees of freedom
Multiple R-squared:  0.4109,    Adjusted R-squared:  0.3725 
F-statistic:  10.7 on 3 and 46 DF,  p-value: 1.879e-05

Nominal Indep. Variables

# Nominal independent variable: frequency table for the 3-category race
# variable (missing values show up as their own NA row)
nes |>
  count(Race3) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 4 × 3
  Race3             n   pct
  <dbl+lbl>     <int> <dbl>
1  1 [White]     3038 71.1 
2  2 [Black]      398  9.32
3  3 [Hispanic]   450 10.5 
4 NA              385  9.01

Nominal Indep. Variables

  • Average thermometer rating for Democratic party by race
# Mean ft_Dem (Democratic party thermometer) within each Race3 category.
# na.rm = TRUE skips missing ft_Dem values; rows with missing Race3 still
# form their own NA group.
nes |> group_by(Race3) |> summarise(avg = mean(ft_Dem, na.rm = TRUE))
# A tibble: 4 × 2
  Race3           avg
  <dbl+lbl>     <dbl>
1  1 [White]     42.7
2  2 [Black]     73.9
3  3 [Hispanic]  59.8
4 NA             50.9

Nominal Indep. Variables

# Mean ft_Dem by race, second way of handling missing data:
# drop_na() removes rows missing either variable before grouping, so no NA
# group appears and mean() needs no na.rm.
nes |>
  drop_na(Race3, ft_Dem) |>
  group_by(Race3) |>
  summarise(avg = mean(ft_Dem))
# A tibble: 3 × 2
  Race3          avg
  <dbl+lbl>    <dbl>
1 1 [White]     42.7
2 2 [Black]     73.9
3 3 [Hispanic]  59.8

Nominal Indep. Variables

# Revisiting data viz: pre-estimation, bivariate plot of group means.
# haven's as_factor() turns the labelled Race3 values into a factor so the
# x axis shows the category labels.
nes |>
  drop_na(Race3, ft_Dem) |>
  group_by(Race3) |>
  summarise(avg = mean(ft_Dem)) |>
  ggplot(aes(x = as_factor(Race3), y = avg)) +
  geom_col(width = 0.3) +
  labs(x = NULL, y = "Thermometer: Democrats")

Nominal Indep. Variables

# Are those differences statistically significant?
# Manual "dummying out" for the regression specification: one 0/1 indicator
# per Race3 category. A missing Race3 stays missing in every indicator.
nes <- nes |>
  mutate(
    white = as.numeric(Race3 == 1),
    black = as.numeric(Race3 == 2),
    hisp = as.numeric(Race3 == 3)
  )

Nominal Indep. Variables

# Reference distribution of Race3 (non-missing rows only)
nes |>
  drop_na(Race3) |>
  count(Race3) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 3 × 3
  Race3            n   pct
  <dbl+lbl>    <int> <dbl>
1 1 [White]     3038  78.2
2 2 [Black]      398  10.2
3 3 [Hispanic]   450  11.6
# Distribution of the white dummy, for comparison with the Race3 table
nes |>
  drop_na(white) |>
  count(white) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 2 × 3
  white     n   pct
  <dbl> <int> <dbl>
1     0   848  21.8
2     1  3038  78.2

Nominal Indep. Variables

# Reference distribution of Race3 (non-missing rows only)
nes |>
  drop_na(Race3) |>
  count(Race3) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 3 × 3
  Race3            n   pct
  <dbl+lbl>    <int> <dbl>
1 1 [White]     3038  78.2
2 2 [Black]      398  10.2
3 3 [Hispanic]   450  11.6
# Distribution of the black dummy, for comparison with the Race3 table
nes |>
  drop_na(black) |>
  count(black) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 2 × 3
  black     n   pct
  <dbl> <int> <dbl>
1     0  3488  89.8
2     1   398  10.2

Nominal Indep. Variables

# Reference distribution of Race3 (non-missing rows only)
nes |>
  drop_na(Race3) |>
  count(Race3) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 3 × 3
  Race3            n   pct
  <dbl+lbl>    <int> <dbl>
1 1 [White]     3038  78.2
2 2 [Black]      398  10.2
3 3 [Hispanic]   450  11.6
# Distribution of the hisp dummy, for comparison with the Race3 table
nes |>
  drop_na(hisp) |>
  count(hisp) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 2 × 3
   hisp     n   pct
  <dbl> <int> <dbl>
1     0  3436  88.4
2     1   450  11.6

Nominal Indep. Variables

# Changing baselines: the omitted dummy is the reference category.
# Here white is left out, so it is the baseline.
lm(ft_Dem ~ black + hisp, data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ black + hisp, data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  42.7405     0.5177   82.55   <2e-16 ***
black        31.1787     1.5148   20.58   <2e-16 ***
hisp         17.0284     1.4507   11.74   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

# Hispanic as baseline: leave the hisp dummy out of the specification
lm(ft_Dem ~ black + white, data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ black + white, data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   59.769      1.355  44.104  < 2e-16 ***
black         14.150      1.965   7.199 7.24e-13 ***
white        -17.028      1.451 -11.738  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

# Black as baseline: leave the black dummy out of the specification
lm(ft_Dem ~ hisp + white, data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ hisp + white, data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   73.919      1.424  51.924  < 2e-16 ***
hisp         -14.150      1.965  -7.199 7.24e-13 ***
white        -31.179      1.515 -20.582  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

# Let lm() create the dummies: wrap Race3 in base R's as.factor() inside the
# formula. The first level (1) becomes the baseline.
lm(ft_Dem ~ as.factor(Race3), data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ as.factor(Race3), data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)        42.7405     0.5177   82.55   <2e-16 ***
as.factor(Race3)2  31.1787     1.5148   20.58   <2e-16 ***
as.factor(Race3)3  17.0284     1.4507   11.74   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

# Same as.factor() approach, but relevel() switches the baseline to level "3"
lm(ft_Dem ~ relevel(as.factor(Race3), ref = "3"), data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ relevel(as.factor(Race3), ref = "3"), data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
                                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)                             59.769      1.355  44.104  < 2e-16 ***
relevel(as.factor(Race3), ref = "3")1  -17.028      1.451 -11.738  < 2e-16 ***
relevel(as.factor(Race3), ref = "3")2   14.150      1.965   7.199 7.24e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

# Same as.factor() approach, but relevel() switches the baseline to level "2"
lm(ft_Dem ~ relevel(as.factor(Race3), ref = "2"), data = nes) |>
  summary()

Call:
lm(formula = ft_Dem ~ relevel(as.factor(Race3), ref = "2"), data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-73.919 -26.740   0.231  23.156  57.260 

Coefficients:
                                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)                             73.919      1.424  51.924  < 2e-16 ***
relevel(as.factor(Race3), ref = "2")1  -31.179      1.515 -20.582  < 2e-16 ***
relevel(as.factor(Race3), ref = "2")3  -14.150      1.965  -7.199 7.24e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.33 on 3824 degrees of freedom
  (444 observations deleted due to missingness)
Multiple R-squared:  0.1177,    Adjusted R-squared:  0.1172 
F-statistic:   255 on 2 and 3824 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

  • Post-estimation from multiple regression model.
# Multiple regression with Race3 entered as a factor (needed for the
# post-estimation steps). haven's as_factor() uses the value labels as level
# names, so the coefficients are labelled by category name.
a <- lm(
  ft_Dem ~ as_factor(Race3) + libcon7 + Female + educ4,
  data = nes
)
# summary.lm() silently ignores a `digits` argument, so it is not passed here;
# to actually change the precision use print(summary(a), digits = ...).
summary(a)

Call:
lm(formula = ft_Dem ~ as_factor(Race3) + libcon7 + Female + educ4, 
    data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-90.976 -15.510   1.842  15.775  77.511 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)               88.6869     1.7027  52.087  < 2e-16 ***
as_factor(Race3)Black     24.5347     1.2952  18.943  < 2e-16 ***
as_factor(Race3)Hispanic  14.0080     1.2418  11.281  < 2e-16 ***
libcon7                  -10.8536     0.2645 -41.038  < 2e-16 ***
Female                     3.3306     0.7729   4.309 1.68e-05 ***
educ4                     -0.5379     0.3891  -1.383    0.167    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 23.36 on 3698 degrees of freedom
  (567 observations deleted due to missingness)
Multiple R-squared:  0.4026,    Adjusted R-squared:  0.4018 
F-statistic: 498.5 on 5 and 3698 DF,  p-value: < 2.2e-16

Nominal Indep. Variables

  • Post-estimation from multiple regression model.
# Use marginaleffects package, save predicted values, i.e., y-hats ("prediction")
# Result is stored in `race` and plotted afterwards using its estimate,
# conf.low and conf.high columns.
race <- predictions(
  # fitted lm object from the multiple regression
  a,
  # report the predictions aggregated by Race3 category
  by="Race3",
  # degrees of freedom taken from the model via insight::get_df()
  # NOTE(review): presumably used for the confidence intervals/p-values -- confirm
  df = insight::get_df(a),
  # numerical differentiation method, passed by name
  numderiv = "richardson",
  # counterfactual grid over the unique values of Race3
  # NOTE(review): per marginaleffects docs this should replicate the data once
  # per Race3 value, holding other covariates as observed -- confirm
  newdata = datagrid(
    Race3=unique, grid_type="counterfactual"))

Nominal Indep. Variables

  • Graph those post-estimation results.
# Dot-and-spike plot: point estimate per Race3 category with a vertical line
# spanning conf.low to conf.high (width = 0 removes the end caps)
ggplot(
  race,
  aes(
    x = as.factor(Race3),
    y = estimate,
    ymin = conf.low,
    ymax = conf.high
  )
) +
  geom_point(size = 3) +
  geom_errorbar(width = 0) +
  labs(x = NULL, y = "Predicted Democratic Thermometer") +
  # Replace the numeric codes on the axis with readable category names
  scale_x_discrete(
    breaks = c("1", "2", "3"),
    labels = c("White", "Black", "Hispanic")
  )

Nominal Indep. Variables

  • Graph those post-estimation results.
# Bar graph with labels: predicted value per Race3 category, with confidence
# interval error bars and the rounded estimate printed inside each bar
race |>
  ggplot(aes(
    x = as.factor(Race3),
    y = estimate,
    ymin = conf.low,
    ymax = conf.high,
    label = round(estimate, digits = 2)
  )) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity")
  geom_col(alpha = 0.8, width = 0.5) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and raises a warning
  geom_errorbar(width = 0.08, linewidth = 0.2) +
  labs(x = NULL, y = "Predicted Democratic Thermometer") +
  # Replace the numeric codes on the axis with readable category names
  scale_x_discrete(
    breaks = c("1", "2", "3"),
    labels = c("White", "Black", "Hispanic")
  ) +
  # The large vjust moves each label down from the bar top, hence white text
  geom_text(vjust = 6, color = "white", size = 3.5)